{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5648949c-5240-4897-aefb-41e4c3f89c83",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\programData\\conda\\env\\ChatGLM2-6B\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
      "F:\\programData\\conda\\env\\ChatGLM2-6B\\lib\\site-packages\\numpy\\.libs\\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll\n",
      "F:\\programData\\conda\\env\\ChatGLM2-6B\\lib\\site-packages\\numpy\\.libs\\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll\n",
      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n"
     ]
    }
   ],
   "source": [
    "import collections\n",
    "import re\n",
    "from d2l import torch as d2l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "033ef8fc-9bb3-4187-945b-c8f1e1eb97f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# 文本总行数：3221\n",
      "the time machine, by h. g. wells [1898]\n",
      "twinkled, and his usually pale face was flushed and animated. the\n"
     ]
    }
   ],
   "source": [
    "#8.2.1. 读取数据集\n",
    "d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',\n",
    "                                '090b5e7e70c295757f55df93cb0a180b9691891a')\n",
    "def read_time_machine():\n",
    "    with open(d2l.download('time_machine'),'r') as f:\n",
    "        lines = f.readlines()\n",
    "    return [re.sub('^A-Za-z'+',',' ',line).strip().lower() for line in lines]\n",
    "\n",
    "lines = read_time_machine()\n",
    "print(f\"# 文本总行数：{len(lines)}\")\n",
    "print(lines[0])\n",
    "print(lines[10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b5dbf7f5-1759-422e-8469-ed35674c51bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['the', 'time', 'machine,', 'by', 'h.', 'g.', 'wells', '[1898]']\n",
      "[]\n",
      "[]\n",
      "[]\n",
      "[]\n",
      "['i']\n",
      "[]\n",
      "[]\n",
      "['the', 'time', 'traveller', '(for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him)']\n",
      "['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us.', 'his', 'grey', 'eyes', 'shone', 'and']\n",
      "['twinkled,', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated.', 'the']\n"
     ]
    }
   ],
   "source": [
    "#8.2.2. 词元化\n",
    "def tokenize(lines,token='word'):\n",
    "    if token == 'word':\n",
    "        return [line.split() for line in lines]\n",
    "    elif token == \"char\":\n",
    "        return [list(line) for line in lines]\n",
    "    else:\n",
    "        print(\"错误：未知词元类型：\" + token)\n",
    "tokens = tokenize(lines)\n",
    "for i in range(11):\n",
    "    print(tokens[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "63b67479-0268-4256-91f9-419c790ee80e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#8.2.3. 词表\n",
    "class Vocab:\n",
    "    def __init__(self,tokens=None,min_freq=0,reserved_tokens=None):\n",
    "        if tokens is None:\n",
    "            tokens = []\n",
    "        if reserved_tokens is None:\n",
    "            reserved_tokens = []\n",
    "\n",
    "        counter = count_corpus(tokens)\n",
    "        self._token_freqs = sorted(counter.items(),key=lambda x:x[1],reverse=True)\n",
    "        self.idx_to_token = ['<unk>'] + reserved_tokens\n",
    "        self.token_to_idx = {token:idx for idx,token in enumerate(self.idx_to_token)}\n",
    "\n",
    "        for token,freq in self._token_freqs:\n",
    "            if freq < min_freq:\n",
    "                break\n",
    "            if token not in self.token_to_idx:\n",
    "                self.idx_to_token.append(token)\n",
    "                self.token_to_idx[token]  = len(self.idx_to_token) -1\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.idx_to_token)\n",
    "    def __getitem__(self,tokens):\n",
    "        if not isinstance(tokens,(list,tuple)):\n",
    "            return self.token_to_idx.get(tokens,self.unk)\n",
    "        return [self.__getitem__(token) for token in tokens]\n",
    "\n",
    "    def to_tokens(self,indices):\n",
    "        if not isinstance(indices,(list,tuple)):\n",
    "            return self.idx_to_token[indices]\n",
    "        return [self.idx_to_token[index] for index in indices]\n",
    "\n",
    "\n",
    "    @property\n",
    "    def unk(self):\n",
    "        return 0\n",
    "    #property\n",
    "    def token_freqs(self):\n",
    "        return self._token_freqs\n",
    "\n",
    "def count_corpus(tokens):\n",
    "    if len(tokens) == 0 or isinstance(tokens[0],list):\n",
    "        tokens = [token for line in tokens for token in line]\n",
    "    return collections.Counter(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "35134249-a95a-4be9-8407-d7dae8087b4a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('<unk>', 0), ('the', 1), ('and', 2), ('i', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('my', 9)]\n"
     ]
    }
   ],
   "source": [
    "vocab = Vocab(tokens)\n",
    "print(list(vocab.token_to_idx.items())[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fde2ac8b-6851-4c4e-80aa-d735b82161c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文本： ['the', 'time', 'machine,', 'by', 'h.', 'g.', 'wells', '[1898]']\n",
      "索引： [1, 18, 197, 37, 2413, 2414, 755, 2415]\n",
      "文本： ['twinkled,', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated.', 'the']\n",
      "索引： [2419, 2, 22, 996, 442, 154, 7, 1410, 2, 1411, 1]\n"
     ]
    }
   ],
   "source": [
    "for i in [0,10]:\n",
    "    print(\"文本：\",tokens[i])\n",
    "    print(\"索引：\",vocab[tokens[i]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f36cb9b3-03d4-446d-a194-b2d49af56c45",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(175758, 45)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#8.2.4. 整合所有功能\n",
    "def load_corpus_time_machine(max_tokens=-1):\n",
    "    lines = read_time_machine()\n",
    "    tokens = tokenize(lines,'char')\n",
    "    vocab = Vocab(tokens)\n",
    "\n",
    "    corpus = [vocab[token] for line in tokens for token in line]\n",
    "    if max_tokens > 0:\n",
    "        corpus = corpus[:max_tokens]\n",
    "    return corpus,vocab\n",
    "\n",
    "corpus,vocab = load_corpus_time_machine()\n",
    "len(corpus),len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82b29d25-0a86-4dcc-bb98-81b878cd7026",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
